Data intro

library(hexbin)
# Load the pre-transformed movie data set and preview its first rows.
mdata <- read.csv(file = "movie_rt_transform.csv")
head(x = mdata)
##   X     gross                                                    genres
## 1 1 760505847                           Action|Adventure|Fantasy|Sci-Fi
## 2 2 309404152                                  Action|Adventure|Fantasy
## 3 4 448130642                                           Action|Thriller
## 4 6  73058679                                   Action|Adventure|Sci-Fi
## 5 7 336530303                                  Action|Adventure|Romance
## 6 8 200807262 Adventure|Animation|Comedy|Family|Fantasy|Musical|Romance
##                                 movie_title
## 1                                   Avatar 
## 2 Pirates of the Caribbean: At World's End 
## 3                    The Dark Knight Rises 
## 4                              John Carter 
## 5                             Spider-Man 3 
## 6                                  Tangled 
##                                        movie_imdb_link    budget
## 1 http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1 237000000
## 2 http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1 300000000
## 3 http://www.imdb.com/title/tt1345836/?ref_=fn_tt_tt_1 250000000
## 4 http://www.imdb.com/title/tt0401729/?ref_=fn_tt_tt_1 263700000
## 5 http://www.imdb.com/title/tt0413300/?ref_=fn_tt_tt_1 258000000
## 6 http://www.imdb.com/title/tt0398286/?ref_=fn_tt_tt_1 260000000
##   title_year imdb_score Action Adventure Animation Biography Comedy Crime
## 1       2009        7.9      1         1         0         0      0     0
## 2       2007        7.1      1         1         0         0      0     0
## 3       2012        8.5      1         0         0         0      0     0
## 4       2012        6.6      1         1         0         0      0     0
## 5       2007        6.2      1         1         0         0      0     0
## 6       2010        7.8      0         1         1         0      1     0
##   Documentary Drama Family Fantasy History Horror Music Musical Mystery
## 1           0     0      0       1       0      0     0       0       0
## 2           0     0      0       1       0      0     0       0       0
## 3           0     0      0       0       0      0     0       0       0
## 4           0     0      0       0       0      0     0       0       0
## 5           0     0      0       0       0      0     0       0       0
## 6           0     0      1       1       0      0     0       1       0
##   Romance Sci.Fi Sport Thriller War Western rt_score  t_gross t_budget
## 1       0      1     0        0   0       0       83 660.2562 492.3032
## 2       0      0     0        0   0       0       45 526.5075 522.4296
## 3       0      0     0        1   0       0       87 577.9841 498.9734
## 4       0      1     0        0   0       0       51 365.8094 505.7269
## 5       1      0     0        0   0       0       63 537.7714 502.9498
## 6       1      0     0        0   0       0       89 472.1621 503.9294
##   t_imdb_score  t_rt_score
## 1          7.9  0.94133219
## 2          7.1 -0.12441070
## 3          8.5  1.10923983
## 4          6.6  0.02482065
## 5          6.2  0.32844628
## 6          7.8  1.20624569

Exploratory analysis

# Marginal distributions of the four raw variables.
hist(mdata$gross, xlab = "Gross", main = "Histogram of Gross")

hist(mdata$budget, xlab = "Budget", main = "Histogram of Budget")

hist(mdata$imdb_score, xlab = "IMDB Score", main = "Histogram of IMDB Scores")

hist(
  mdata$rt_score,
  xlab = "Rotten Tomato Score",
  main = "Histogram of Rotten Tomato Scores"
)

# Normal-score version of the RT score: (score + 0.5) / 101 is passed through
# the normal quantile function with mean 0 (and the default sd).
t_rt_score <- qnorm((mdata$rt_score + 0.5) / 101, mean = 0)
hist(t_rt_score)

# Hexagonally binned scatter plots: raw gross/budget, the two review scores,
# and gross/budget again on the log scale.
plot(
  hexbin(mdata$gross, mdata$budget),
  xlab = "Gross (USD)",
  ylab = "Budget (USD)",
  main = "Gross vs. Budget",
  legend = 0
)

plot(
  hexbin(mdata$imdb_score, mdata$rt_score),
  xlab = "IMDB Score",
  ylab = "RT Score",
  main = "IMDB Score vs. RT Score",
  legend = 0
)

plot(
  hexbin(log(mdata$gross), log(mdata$budget)),
  xlab = "log gross",
  ylab = "log budget",
  main = "Gross vs. Budget",
  legend = 0
)

Univariate summaries

# Divide each variable by its own maximum so the four can be compared side by
# side in one box plot.
scale_by_max <- function(x) x / max(x)

mdata3 <- mdata
mdata3$norm_gross <- scale_by_max(mdata$gross)
mdata3$norm_budget <- scale_by_max(mdata$budget)
mdata3$norm_imdb_score <- scale_by_max(mdata$imdb_score)
mdata3$norm_rt_score <- scale_by_max(mdata$rt_score)

# The four columns just appended (positions 35-38 of the 38-column frame).
boxplot(
  mdata3[, c("norm_gross", "norm_budget", "norm_imdb_score", "norm_rt_score")],
  main = "Univariate Summary"
)

Pairwise scatter plots

First the untransformed scatter plot.

# Hexbin scatter-plot matrix of the raw gross, budget, IMDB score and RT score
# columns (positions 2, 6, 8 and 30 of the data frame).
mdata2 <- mdata
hexplom(~ mdata2[, c("gross", "budget", "imdb_score", "rt_score")])

Then apply a log transform to gross and budget.

# Add log-scale gross and budget, then redraw the scatter-plot matrix with the
# logged columns in place of the raw ones (log budget first, then log gross).
mdata2$log_gross <- log(mdata2$gross)
mdata2$log_budget <- log(mdata2$budget)
hexplom(~ mdata2[, c("log_budget", "log_gross", "imdb_score", "rt_score")])

Residuals for simple models

# Simple linear model on the original scale: gross explained by budget and the
# two review scores.
model <- lm(mdata$gross ~ mdata$budget + mdata$imdb_score + mdata$rt_score)
summary(model)
## 
## Call:
## lm(formula = mdata$gross ~ mdata$budget + mdata$imdb_score + 
##     mdata$rt_score)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -229627824  -26241779   -7469321   16632442  468357060 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -4.390e+07  1.054e+07  -4.167 3.29e-05 ***
## mdata$budget      1.103e+00  2.880e-02  38.309  < 2e-16 ***
## mdata$imdb_score  5.625e+06  2.020e+06   2.784  0.00544 ** 
## mdata$rt_score    3.628e+05  7.613e+04   4.765 2.09e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54490000 on 1325 degrees of freedom
## Multiple R-squared:  0.5611, Adjusted R-squared:  0.5601 
## F-statistic: 564.7 on 3 and 1325 DF,  p-value: < 2.2e-16
# Residuals against fitted values for the raw-scale model.
plot(
  fitted(model),
  resid(model),
  xlab = "Fitted",
  ylab = "Residuals",
  main = "residual plot"
)

# Same model with gross and budget on the log scale; the scores stay as they are.
model <- lm(
  log(mdata$gross) ~ log(mdata$budget) + mdata$imdb_score + mdata$rt_score
)
summary(model)
## 
## Call:
## lm(formula = log(mdata$gross) ~ log(mdata$budget) + mdata$imdb_score + 
##     mdata$rt_score)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.4637 -0.5004  0.1653  0.7933  9.0791 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.633252   0.534917  -3.053  0.00231 ** 
## log(mdata$budget)  1.055704   0.028726  36.751  < 2e-16 ***
## mdata$imdb_score   0.004319   0.056501   0.076  0.93908    
## mdata$rt_score     0.010511   0.002146   4.898 1.09e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.524 on 1325 degrees of freedom
## Multiple R-squared:  0.5162, Adjusted R-squared:  0.5151 
## F-statistic: 471.3 on 3 and 1325 DF,  p-value: < 2.2e-16
# Residuals against fitted values for the log-scale model.
plot(
  fitted(model),
  resid(model),
  xlab = "Fitted",
  ylab = "Residuals",
  main = "residual plot"
)

Power Transform

Try exact transformation.

library(car)
## Warning: package 'car' was built under R version 3.3.2
# Jointly estimate Box-Cox powers for gross, budget, IMDB score and RT score
# (intercept-only model, i.e. transformation towards multivariate normality).
# The RT score is shifted by 0.01 before estimation -- presumably because the
# Box-Cox family needs strictly positive values; confirm against the data.
ans <- powerTransform(cbind(mdata$gross, mdata$budget,
                            mdata$imdb_score, mdata$rt_score + .01) ~ 1)
# Print the stored fit instead of running the identical estimation a second
# time; the displayed parameter estimates are the same.
ans
## Estimated transformation parameters 
##        Y1        Y2        Y3        Y4 
## 0.2953805 0.2549329 2.4030884 0.7905505
# Report the estimated powers with standard errors, Wald bounds and the
# likelihood-ratio tests for the candidate power vectors.
summary(ans)
## bcPower Transformations to Multinormality 
##    Est.Power Std.Err. Wald Lower Bound Wald Upper Bound
## Y1    0.2954   0.0109           0.2740           0.3167
## Y2    0.2549   0.0140           0.2276           0.2823
## Y3    2.4031   0.1096           2.1882           2.6180
## Y4    0.7906   0.0371           0.7179           0.8632
## 
## Likelihood ratio tests about transformation parameters
##                                            LRT df pval
## LR test, lambda = (0 0 0 0)           2827.280  4    0
## LR test, lambda = (1 1 1 1)           3936.344  4    0
## LR test, lambda = (0.3 0.25 2.4 0.79)    0.000  4    1
# Apply Box-Cox transforms using the (rounded) estimated powers and regress
# transformed gross on the transformed predictors.
# NOTE(review): the RT power was estimated on rt_score + .01 but is applied
# here to the unshifted rt_score -- confirm this is intended.
box_cox <- function(x, lambda) (x^lambda - 1) / lambda

gross_trans <- box_cox(mdata$gross, 0.295)
budget_trans <- box_cox(mdata$budget, 0.255)
imdb_trans <- box_cox(mdata$imdb_score, 2.4)
rt_trans <- box_cox(mdata$rt_score, .79)

hexplom(~ cbind(gross_trans, budget_trans, imdb_trans, rt_trans))
model <- lm(gross_trans ~ budget_trans + imdb_trans + rt_trans)

summary(model)
## 
## Call:
## lm(formula = gross_trans ~ budget_trans + imdb_trans + rt_trans)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -573.78 -102.85   -0.81  100.33  705.12 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -130.90218   18.04851  -7.253 6.92e-13 ***
## budget_trans    1.87940    0.04392  42.793  < 2e-16 ***
## imdb_trans      1.38693    0.49153   2.822  0.00485 ** 
## rt_trans        2.26168    0.51584   4.384 1.25e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 159.4 on 1325 degrees of freedom
## Multiple R-squared:  0.6042, Adjusted R-squared:  0.6033 
## F-statistic: 674.2 on 3 and 1325 DF,  p-value: < 2.2e-16
# Residuals against fitted values for the exact power-transform model.
plot(
  fitted(model),
  resid(model),
  xlab = "Fitted",
  ylab = "Residuals",
  main = "residual plot"
)

Try approximate.

# Approximate transform: power 0.25 for gross and budget, RT score left as is,
# and the IMDB score entering through both a linear term and a Box-Cox term
# with power 2.
t_gross <- with(mdata, (gross^0.25 - 1) / 0.25)
t_budget <- with(mdata, (budget^0.25 - 1) / 0.25)
t_imdb_1 <- mdata[["imdb_score"]]
t_imdb_2 <- with(mdata, (imdb_score^2 - 1) / 2)
t_rt <- mdata[["rt_score"]]

hexplom(~ cbind(t_gross, t_budget, t_imdb_2, t_rt))
model1 <- lm(t_gross ~ t_budget + t_imdb_1 + t_imdb_2 + t_rt)

summary(model1)
## 
## Call:
## lm(formula = t_gross ~ t_budget + t_imdb_1 + t_imdb_2 + t_rt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -284.00  -45.49    2.59   46.17  338.11 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 161.09001   44.27268   3.639 0.000285 ***
## t_budget      0.95733    0.02243  42.690  < 2e-16 ***
## t_imdb_1    -62.27125   14.35158  -4.339 1.54e-05 ***
## t_imdb_2     11.49206    2.47120   4.650 3.64e-06 ***
## t_rt          0.43296    0.11042   3.921 9.26e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 74.66 on 1324 degrees of freedom
## Multiple R-squared:  0.6037, Adjusted R-squared:  0.6025 
## F-statistic: 504.3 on 4 and 1324 DF,  p-value: < 2.2e-16
# Residuals against fitted values for the approximate-transform model.
plot(
  fitted(model1),
  resid(model1),
  xlab = "Fitted",
  ylab = "Residuals",
  main = "residual plot"
)

Normal transform of rotten tomatoes score.

# Re-estimate the powers with the normal-score RT variable in place of the raw
# score. It is shifted so that its minimum equals 0.01 before estimation.
ans <- powerTransform(
  cbind(
    mdata$gross,
    mdata$budget,
    mdata$imdb_score,
    t_rt_score - min(t_rt_score) + .01
  ) ~ 1
)
summary(ans)
## bcPower Transformations to Multinormality 
##    Est.Power Std.Err. Wald Lower Bound Wald Upper Bound
## Y1    0.2952   0.0109           0.2739           0.3165
## Y2    0.2543   0.0139           0.2270           0.2816
## Y3    2.4339   0.1108           2.2167           2.6510
## Y4    0.9676   0.0467           0.8761           1.0590
## 
## Likelihood ratio tests about transformation parameters
##                                             LRT df      pval
## LR test, lambda = (0 0 0 0)         2995.474884  4 0.0000000
## LR test, lambda = (1 1 1 1)         3899.397058  4 0.0000000
## LR test, lambda = (0.3 0.25 2.43 1)    0.480411  4 0.9753797
# Same approximate transform as before, except that the RT predictor is now the
# normal-score variable t_rt_score rather than the raw score.
box_cox <- function(x, lambda) (x^lambda - 1) / lambda

t_gross <- box_cox(mdata$gross, 0.25)
t_budget <- box_cox(mdata$budget, 0.25)
t_imdb_1 <- mdata$imdb_score
t_imdb_2 <- box_cox(mdata$imdb_score, 2)
t_rt <- t_rt_score

hexplom(~ cbind(t_gross, t_budget, t_imdb_2, t_rt))
model2 <- lm(t_gross ~ t_budget + t_imdb_1 + t_imdb_2 + t_rt)

summary(model2)
## 
## Call:
## lm(formula = t_gross ~ t_budget + t_imdb_1 + t_imdb_2 + t_rt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -283.68  -45.24    2.62   46.14  338.89 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 187.0276    43.6062   4.289 1.92e-05 ***
## t_budget      0.9579     0.0224  42.763  < 2e-16 ***
## t_imdb_1    -63.1530    14.2745  -4.424 1.05e-05 ***
## t_imdb_2     11.5402     2.4501   4.710 2.74e-06 ***
## t_rt         14.6784     3.4910   4.205 2.79e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 74.6 on 1324 degrees of freedom
## Multiple R-squared:  0.6044, Adjusted R-squared:  0.6032 
## F-statistic: 505.7 on 4 and 1324 DF,  p-value: < 2.2e-16
# Hexbin version of the residual plot for model2.
plot(
  hexbin(fitted(model2), resid(model2)),
  xlab = "Fitted",
  ylab = "Residuals",
  main = "Residual Plot",
  legend = 0
)

# Set the plot layout to a single panel, then show transformed gross against
# transformed budget.
par(mfrow = c(1, 1))
plot(
  hexbin(t_gross, t_budget),
  xlab = "Transformed Gross",
  ylab = "Transformed Budget",
  main = "Gross vs. Budget",
  legend = 0
)

Initial regression

# Work on a copy of the data and pull out the 21 genre indicator columns
# (positions 9-29).
rt <- mdata
genre_old <- rt[, 9:29]
# Merge the Music and Musical indicators into one column and drop the two
# originals (positions 13 and 14 within genre_old).
Music <- as.numeric(genre_old$Music | genre_old$Musical)
genre <- cbind(genre_old[, -(13:14)], Music)

# Baseline regression of raw gross on budget, IMDB score, the genre indicators
# in columns 10-29, RT score, and release year as a factor.
data_initial <- cbind(rt[, c(2, 6, 8, 10:30)], as.factor(rt$title_year))
lm1 <- lm(gross ~ ., data = data_initial)
summary(lm1)
## 
## Call:
## lm(formula = gross ~ ., data = data_initial)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -236630412  -26076779   -7207365   16453032  460557534 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -5.838e+07  1.291e+07  -4.521 6.72e-06 ***
## budget                          1.097e+00  4.062e-02  27.001  < 2e-16 ***
## imdb_score                      8.658e+06  2.159e+06   4.011 6.40e-05 ***
## Adventure                      -7.292e+06  5.172e+06  -1.410  0.15884    
## Animation                       5.613e+06  7.699e+06   0.729  0.46610    
## Biography                       5.138e+06  7.348e+06   0.699  0.48450    
## Comedy                          8.833e+06  4.102e+06   2.153  0.03147 *  
## Crime                          -6.446e+06  4.799e+06  -1.343  0.17940    
## Documentary                    -1.827e+07  1.093e+07  -1.672  0.09470 .  
## Drama                          -1.121e+07  3.901e+06  -2.872  0.00414 ** 
## Family                         -4.552e+06  6.022e+06  -0.756  0.44989    
## Fantasy                        -7.261e+05  4.776e+06  -0.152  0.87918    
## History                        -7.774e+06  1.020e+07  -0.762  0.44618    
## Horror                          6.477e+06  5.873e+06   1.103  0.27030    
## Music                           1.009e+07  7.389e+06   1.365  0.17239    
## Musical                        -2.715e+06  1.039e+07  -0.261  0.79394    
## Mystery                        -1.456e+06  5.668e+06  -0.257  0.79738    
## Romance                         8.988e+05  4.117e+06   0.218  0.82724    
## Sci.Fi                          6.495e+06  5.093e+06   1.275  0.20244    
## Sport                          -8.764e+06  7.761e+06  -1.129  0.25900    
## Thriller                       -8.084e+05  4.422e+06  -0.183  0.85498    
## War                             7.136e+06  9.634e+06   0.741  0.45901    
## Western                        -1.459e+07  1.529e+07  -0.954  0.34037    
## rt_score                        3.673e+05  7.802e+04   4.707 2.78e-06 ***
## `as.factor(rt$title_year)`2007 -3.844e+06  6.807e+06  -0.565  0.57239    
## `as.factor(rt$title_year)`2008 -3.283e+06  6.523e+06  -0.503  0.61488    
## `as.factor(rt$title_year)`2009  1.060e+06  6.516e+06   0.163  0.87085    
## `as.factor(rt$title_year)`2010 -3.376e+06  6.608e+06  -0.511  0.60949    
## `as.factor(rt$title_year)`2011 -7.857e+06  6.692e+06  -1.174  0.24055    
## `as.factor(rt$title_year)`2012  3.803e+06  6.644e+06   0.572  0.56719    
## `as.factor(rt$title_year)`2013 -5.225e+06  6.712e+06  -0.779  0.43638    
## `as.factor(rt$title_year)`2014  3.953e+06  6.754e+06   0.585  0.55845    
## `as.factor(rt$title_year)`2015  2.968e+06  7.045e+06   0.421  0.67357    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54160000 on 1296 degrees of freedom
## Multiple R-squared:  0.5759, Adjusted R-squared:  0.5655 
## F-statistic:    55 on 32 and 1296 DF,  p-value: < 2.2e-16

Treat Genres

Transformation

# Keep only the transformed continuous columns (everything after column 30),
# then append the squared transformed IMDB score and the release year as a
# factor.
trans1 <- read.csv("movie_rt_transform.csv")
continious_trans1 <- trans1[, -(1:30)]
continious_trans1$t_imdb_score2 <- continious_trans1$t_imdb_score^2
continious_trans1$year <- as.factor(rt$title_year)

Model selection

library(corrplot)
## cluster into 3 groups
corrplot(cor(genre), order = "hclust", addrect = 3, method = "color")

# One 0/1 indicator per group: 1 when the movie carries any genre in the group.
# Group sizes are 8, 4 and 7 genres. Drama is not assigned to any group --
# NOTE(review): confirm that omission is intentional.
c3_1 <- as.numeric(with(
  genre,
  Comedy | Romance | Fantasy | Adventure | Animation | Family |
    Documentary | Music
))
c3_2 <- as.numeric(with(genre, Sport | War | Biography | History))
c3_3 <- as.numeric(with(
  genre,
  Western | Crime | Thriller | Action | Sci.Fi | Horror | Mystery
))
c3 <- cbind(c3_1, c3_2, c3_3)
colSums(c3)
## c3_1 c3_2 c3_3 
##  906  157  672
## cluster into 4 groups
corrplot(cor(genre), order = "hclust", addrect = 4, method = "color")

# One 0/1 indicator per group: 1 when the movie carries any genre in the group.
# Group sizes are 2, 6, 4 and 7 genres; Drama is not assigned to any group.
c4_1 <- as.numeric(with(genre, Comedy | Romance))
c4_2 <- as.numeric(with(
  genre,
  Fantasy | Adventure | Animation | Family | Documentary | Music
))
c4_3 <- as.numeric(with(genre, Sport | War | Biography | History))
c4_4 <- as.numeric(with(
  genre,
  Western | Crime | Thriller | Action | Sci.Fi | Horror | Mystery
))
c4 <- cbind(c4_1, c4_2, c4_3, c4_4)

## cluster into 5 groups
corrplot(cor(genre), order = "hclust", addrect = 5, method = "color")

# One 0/1 indicator per group: 1 when the movie carries any genre in the group.
# Group sizes are 2, 6, 4, 3 and 4 genres; Drama is not assigned to any group.
c5_1 <- as.numeric(with(genre, Comedy | Romance))
c5_2 <- as.numeric(with(
  genre,
  Fantasy | Adventure | Animation | Family | Documentary | Music
))
c5_3 <- as.numeric(with(genre, Sport | War | Biography | History))
c5_4 <- as.numeric(with(genre, Western | Crime | Thriller))
c5_5 <- as.numeric(with(genre, Action | Sci.Fi | Horror | Mystery))
c5 <- cbind(c5_1, c5_2, c5_3, c5_4, c5_5)

## cluster into 6 groups
corrplot(cor(genre), order = "hclust", addrect = 6, method = "color")

# One 0/1 indicator per group: 1 when the movie carries any genre in the group.
# Group sizes are 2, 6, 4, 3, 2 and 2 genres; Drama is not assigned to any
# group.
c6_1 <- as.numeric(with(genre, Comedy | Romance))
c6_2 <- as.numeric(with(
  genre,
  Fantasy | Adventure | Animation | Family | Documentary | Music
))
c6_3 <- as.numeric(with(genre, Sport | War | Biography | History))
c6_4 <- as.numeric(with(genre, Western | Crime | Thriller))
c6_5 <- as.numeric(with(genre, Action | Sci.Fi))
c6_6 <- as.numeric(with(genre, Horror | Mystery))
c6 <- cbind(c6_1, c6_2, c6_3, c6_4, c6_5, c6_6)

## cluster into 7 groups
corrplot(cor(genre), order = "hclust", addrect = 7, method = "color")

# One 0/1 indicator per group: 1 when the movie carries any genre in the group.
# Group sizes are 2, 4, 4, 3, 2, 2 and 2 genres; Drama is not assigned to any
# group.
c7_1 <- as.numeric(with(genre, Comedy | Romance))
c7_2 <- as.numeric(with(genre, Fantasy | Adventure | Animation | Family))
c7_3 <- as.numeric(with(genre, Sport | War | Biography | History))
c7_4 <- as.numeric(with(genre, Western | Crime | Thriller))
c7_5 <- as.numeric(with(genre, Action | Sci.Fi))
c7_6 <- as.numeric(with(genre, Horror | Mystery))
c7_7 <- as.numeric(with(genre, Documentary | Music))
c7 <- cbind(c7_1, c7_2, c7_3, c7_4, c7_5, c7_6, c7_7)

# Three-cluster model: transformed gross on the continuous predictors, the
# year factor and the three genre-group indicators.
# continious_trans1 already carries release year as the factor `year`, so the
# numeric rt$title_year column is no longer appended: it was an exact linear
# combination of the year dummies (aliased, NA coefficient) and made this
# design differ from the c4-c7 and all-genre designs compared by BIC.
data_c3 <- cbind(continious_trans1, c3)
lm_c3 <- lm(t_gross ~ ., data = as.data.frame(data_c3))
s3 <- summary(lm_c3)
# Residuals against fitted values for the three-cluster model.
plot(lm_c3$fitted.values, lm_c3$residuals)

# Fit the same regression of transformed gross with each alternative genre
# encoding: 4, 5, 6 and 7 genre groups, and finally all original genre
# indicators. Each summary is kept for later inspection.
data_c4 <- cbind(continious_trans1, c4)
lm_c4 <- lm(t_gross ~ ., data = as.data.frame(data_c4))
s4 <- summary(lm_c4)

data_c5 <- cbind(continious_trans1, c5)
lm_c5 <- lm(t_gross ~ ., data = as.data.frame(data_c5))
s5 <- summary(lm_c5)

data_c6 <- cbind(continious_trans1, c6)
lm_c6 <- lm(t_gross ~ ., data = as.data.frame(data_c6))
s6 <- summary(lm_c6)

data_c7 <- cbind(continious_trans1, c7)
lm_c7 <- lm(t_gross ~ ., data = as.data.frame(data_c7))
s7 <- summary(lm_c7)

data_all <- cbind(continious_trans1, genre_old)
lm_all <- lm(t_gross ~ ., data = as.data.frame(data_all))
s_all <- summary(lm_all)

# Collect BIC for every candidate encoding (lower is better).
bic <- BIC(lm_c3, lm_c4, lm_c5, lm_c6, lm_c7, lm_all)

BIC suggests c6

Diagnostics

Influential Points

We calculated Cook’s distance for the model (shown below). The maximum Cook’s distance is around 0.025, which is quite small.

## 
## Attaching package: 'faraway'
## The following objects are masked from 'package:car':
## 
##     logit, vif

Outliers

The range for standardized residuals is:

## [1] -4.629635  4.674346

QQ plot for standardized residuals:

Points with large standardized residuals:

##      fitted(lm_c6)          movie_title  t_gross  t_budget t_imdb_score
## 349      380.58399       Winter's Tale   44.9631 348.04469          6.2
## 1298      65.02993 Paranormal Activity  403.6925  40.26728          6.3
##      t_rt_score   studres    stdres
## 349  -1.1092398 -4.666226 -4.629635
## 1298  0.9413322  4.712052  4.674346

Dummy variable for Paranormal Activity: (it is significant; this movie is an outlier).

## 
## Call:
## lm(formula = t_gross ~ . + D1, data = as.data.frame(data_c6))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -325.56  -43.96   -0.32   46.11  209.93 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -4.989e+03  1.448e+03  -3.446 0.000587 ***
## t_budget                   9.887e-01  2.603e-02  37.982  < 2e-16 ***
## t_imdb_score              -7.571e+01  1.403e+01  -5.396 8.09e-08 ***
## t_rt_score                 1.280e+01  3.445e+00   3.715 0.000212 ***
## t_imdb_score2              7.139e+00  1.209e+00   5.905 4.48e-09 ***
## `movie_rt_transform[, 7]`  2.577e+00  7.213e-01   3.572 0.000367 ***
## c6_1                       1.643e+01  4.862e+00   3.380 0.000746 ***
## c6_2                      -4.156e+00  4.718e+00  -0.881 0.378492    
## c6_3                      -1.990e+01  6.662e+00  -2.987 0.002873 ** 
## c6_4                      -3.453e+00  5.135e+00  -0.672 0.501388    
## c6_5                      -7.184e+00  5.374e+00  -1.337 0.181539    
## c6_6                       2.338e+01  6.038e+00   3.871 0.000114 ***
## D1                         3.402e+02  7.323e+01   4.645 3.73e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 72.7 on 1316 degrees of freedom
## Multiple R-squared:  0.6266, Adjusted R-squared:  0.6232 
## F-statistic:   184 on 12 and 1316 DF,  p-value: < 2.2e-16

It turns out to be significant; it is an outlier.

Dummy variable for Winter’s Tale: (it is significant; this movie is an outlier).

## 
## Call:
## lm(formula = t_gross ~ . + D2, data = as.data.frame(data_c6))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -259.68  -44.65    0.22   45.59  335.30 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -5044.9415  1449.1108  -3.481 0.000515 ***
## t_budget                      0.9829     0.0260  37.802  < 2e-16 ***
## t_imdb_score                -73.7175    14.0340  -5.253 1.75e-07 ***
## t_rt_score                   12.6463     3.4481   3.668 0.000255 ***
## t_imdb_score2                 6.9949     1.2091   5.785 9.05e-09 ***
## `movie_rt_transform[, 7]`     2.6017     0.7219   3.604 0.000325 ***
## c6_1                         16.8378     4.8671   3.460 0.000558 ***
## c6_2                         -3.5078     4.7235  -0.743 0.457848    
## c6_3                        -19.7999     6.6659  -2.970 0.003029 ** 
## c6_4                         -4.7369     5.1323  -0.923 0.356203    
## c6_5                         -7.0889     5.3762  -1.319 0.187538    
## c6_6                         27.0472     6.0530   4.468 8.56e-06 ***
## D2                         -330.1750    73.2520  -4.507 7.15e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 72.73 on 1316 degrees of freedom
## Multiple R-squared:  0.6262, Adjusted R-squared:  0.6228 
## F-statistic: 183.7 on 12 and 1316 DF,  p-value: < 2.2e-16

It turns out to be significant; it is an outlier.

Model without influential points and outliers:

## 
## Call:
## lm(formula = t_gross ~ ., data = as.data.frame(data_c6), subset = (cook < 
##     0.015))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -258.602  -44.241    0.003   45.607  206.804 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -5.218e+03  1.438e+03  -3.629 0.000295 ***
## t_budget                   9.918e-01  2.584e-02  38.384  < 2e-16 ***
## t_imdb_score              -8.160e+01  1.434e+01  -5.689 1.57e-08 ***
## t_rt_score                 1.253e+01  3.434e+00   3.649 0.000273 ***
## t_imdb_score2              7.616e+00  1.226e+00   6.210 7.11e-10 ***
## `movie_rt_transform[, 7]`  2.698e+00  7.161e-01   3.768 0.000172 ***
## c6_1                       1.751e+01  4.827e+00   3.628 0.000297 ***
## c6_2                      -3.781e+00  4.689e+00  -0.806 0.420210    
## c6_3                      -1.970e+01  6.609e+00  -2.980 0.002933 ** 
## c6_4                      -3.737e+00  5.093e+00  -0.734 0.463251    
## c6_5                      -7.705e+00  5.331e+00  -1.445 0.148611    
## c6_6                       2.547e+01  6.009e+00   4.239 2.40e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 72.1 on 1314 degrees of freedom
## Multiple R-squared:  0.6317, Adjusted R-squared:  0.6286 
## F-statistic: 204.9 on 11 and 1314 DF,  p-value: < 2.2e-16

Censor

library(censReg)
## Loading required package: maxLik
## Loading required package: miscTools
## Warning: package 'miscTools' was built under R version 3.3.2
## 
## Please cite the 'maxLik' package as:
## Henningsen, Arne and Toomet, Ott (2011). maxLik: A package for maximum likelihood estimation in R. Computational Statistics 26(3), 443-458. DOI 10.1007/s00180-010-0217-1.
## 
## If you have questions, suggestions, or comments regarding the 'maxLik' package, please use a forum or 'tracker' at maxLik's R-Forge site:
## https://r-forge.r-project.org/projects/maxlik/
# Rebuild the continuous design columns from mdata: the transformed variables
# (everything after column 30), the squared transformed IMDB score, and the
# release year taken from column 7.
movie_rt_transform <- mdata
continious_trans1 <- movie_rt_transform[, -(1:30)]
continious_trans1$t_imdb_score2 <- continious_trans1$t_imdb_score^2
continious_trans1$year <- movie_rt_transform[, 7]
# Append one synthetic row that is 0 in every column. The censReg summary
# further down reports exactly one left-censored observation, which appears to
# be this row. NOTE(review): confirm that padding the data this way is
# intended -- the row also adds a year level "0" with a single observation.
continious_trans1 <- rbind(continious_trans1, 0)
# Convert year to a factor only after the padding row has been added.
continious_trans1$year <- as.factor(continious_trans1$year)

# Rebuild the genre indicators: take the 21 genre columns and merge Music and
# Musical into a single column.
genre_old <- movie_rt_transform[, 9:29]
Music <- as.numeric(genre_old$Music | genre_old$Musical)
genre <- cbind(genre_old[, -(13:14)], Music)

# Six genre-group indicators (2, 6, 4, 3, 2 and 2 genres per group); Drama is
# not assigned to any group.
c6_1 <- as.numeric(with(genre, Comedy | Romance))
c6_2 <- as.numeric(with(
  genre,
  Fantasy | Adventure | Animation | Family | Documentary | Music
))
c6_3 <- as.numeric(with(genre, Sport | War | Biography | History))
c6_4 <- as.numeric(with(genre, Western | Crime | Thriller))
c6_5 <- as.numeric(with(genre, Action | Sci.Fi))
c6_6 <- as.numeric(with(genre, Horror | Mystery))
c6 <- cbind(c6_1, c6_2, c6_3, c6_4, c6_5, c6_6)
# Add a matching all-zero row so c6 lines up with the padded continuous data.
c6 <- rbind(c6, 0)

# Combine the padded continuous columns with the padded genre-group indicators
# and refit the ordinary least-squares model on this data, to serve as the
# comparison for the censored regression.
data_c6 <- cbind(continious_trans1, c6)

lm_c6 <- lm(data_c6$t_gross ~ ., data = as.data.frame(data_c6))
summary(lm_c6)
## 
## Call:
## lm(formula = data_c6$t_gross ~ ., data = as.data.frame(data_c6))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -335.62  -45.60    0.31   46.17  338.66 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -4.289e-12  7.322e+01   0.000 1.000000    
## t_budget       9.866e-01  2.627e-02  37.556  < 2e-16 ***
## t_imdb_score  -7.399e+01  1.415e+01  -5.230 1.97e-07 ***
## t_rt_score     1.394e+01  3.484e+00   4.002 6.64e-05 ***
## t_imdb_score2  6.939e+00  1.219e+00   5.692 1.55e-08 ***
## year2006       1.862e+02  8.455e+01   2.202 0.027817 *  
## year2007       1.774e+02  8.465e+01   2.096 0.036285 *  
## year2008       1.803e+02  8.458e+01   2.132 0.033191 *  
## year2009       1.819e+02  8.471e+01   2.148 0.031921 *  
## year2010       1.847e+02  8.476e+01   2.179 0.029490 *  
## year2011       1.831e+02  8.467e+01   2.162 0.030791 *  
## year2012       1.990e+02  8.481e+01   2.347 0.019074 *  
## year2013       1.991e+02  8.477e+01   2.349 0.018978 *  
## year2014       2.073e+02  8.474e+01   2.447 0.014543 *  
## year2015       1.905e+02  8.484e+01   2.246 0.024896 *  
## c6_1           1.653e+01  4.904e+00   3.371 0.000772 ***
## c6_2          -4.686e+00  4.761e+00  -0.984 0.325241    
## c6_3          -2.049e+01  6.748e+00  -3.036 0.002445 ** 
## c6_4          -4.309e+00  5.179e+00  -0.832 0.405524    
## c6_5          -6.840e+00  5.433e+00  -1.259 0.208275    
## c6_6           2.546e+01  6.101e+00   4.172 3.21e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 73.22 on 1309 degrees of freedom
## Multiple R-squared:  0.625,  Adjusted R-squared:  0.6192 
## F-statistic: 109.1 on 20 and 1309 DF,  p-value: < 2.2e-16
# Censored (tobit-type) regression with the same response and predictors as
# the least-squares fit, using censReg's default censoring limits.
clm_s6 <- censReg(
  formula = data_c6$t_gross ~ data_c6$t_budget + data_c6$t_imdb_score +
    data_c6$t_rt_score + data_c6$t_imdb_score2 + data_c6$year +
    data_c6$c6_1 + data_c6$c6_2 + data_c6$c6_3 +
    data_c6$c6_4 + data_c6$c6_5 + data_c6$c6_6
)
summary(clm_s6)
## 
## Call:
## censReg(formula = data_c6$t_gross ~ data_c6$t_budget + data_c6$t_imdb_score + 
##     data_c6$t_rt_score + data_c6$t_imdb_score2 + data_c6$year + 
##     data_c6$c6_1 + data_c6$c6_2 + data_c6$c6_3 + data_c6$c6_4 + 
##     data_c6$c6_5 + data_c6$c6_6)
## 
## Observations:
##          Total  Left-censored     Uncensored Right-censored 
##           1330              1           1329              0 
## 
## Coefficients:
##                         Estimate Std. error t value  Pr(> t)    
## (Intercept)           -231.20984  809.64485  -0.286 0.775208    
## data_c6$t_budget         0.98660    0.02607  37.842  < 2e-16 ***
## data_c6$t_imdb_score   -73.96865   14.03893  -5.269 1.37e-07 ***
## data_c6$t_rt_score      13.94225    3.45777   4.032 5.53e-05 ***
## data_c6$t_imdb_score2    6.93755    1.20982   5.734 9.79e-09 ***
## data_c6$year2006       417.36904  810.73131   0.515 0.606689    
## data_c6$year2007       408.58067  810.74172   0.504 0.614290    
## data_c6$year2008       411.47693  810.73384   0.508 0.611779    
## data_c6$year2009       413.09014  810.74759   0.510 0.610389    
## data_c6$year2010       415.86186  810.75224   0.513 0.607998    
## data_c6$year2011       414.22537  810.74352   0.511 0.609407    
## data_c6$year2012       430.19563  810.75734   0.531 0.595689    
## data_c6$year2013       430.27326  810.75374   0.531 0.595621    
## data_c6$year2014       438.50428  810.75073   0.541 0.588603    
## data_c6$year2015       421.68073  810.76120   0.520 0.602991    
## data_c6$c6_1            16.52826    4.86652   3.396 0.000683 ***
## data_c6$c6_2            -4.68529    4.72530  -0.992 0.321426    
## data_c6$c6_3           -20.48595    6.69707  -3.059 0.002221 ** 
## data_c6$c6_4            -4.30921    5.14010  -0.838 0.401834    
## data_c6$c6_5            -6.84014    5.39199  -1.269 0.204592    
## data_c6$c6_6            25.45859    6.05539   4.204 2.62e-05 ***
## logSigma                 4.28595    0.01940 220.966  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Newton-Raphson maximisation, 16 iterations
## Return code 2: successive function values within tolerance limit
## Log-likelihood: -7581.796 on 22 Df

The results for our covariates of interest are very similar.